# -*- coding: utf-8 -*-
"""
For preprocessing the ipums data (full census data)

"""

import sys
import argparse
import string
import codecs
from os import path
import random
from collections import defaultdict
import numpy as np
import operator
import pandas as pd
import glob

# Seed the standard-library RNG with a fixed value at import time.
random.seed(1776)


def new_county_info():
    """
    Build a fresh, zeroed tally record for one geographic unit.
    :return: new dict with float counters for T/notT/N/W1/W2 and int counters for X/notX
    """
    county_info = dict.fromkeys(("totalT", "totalNotT", "N", "totalW1", "totalW2"), 0.0)
    # The X / notX counters start out as ints, unlike the other tallies.
    county_info["totalX"] = 0
    county_info["totalNotX"] = 0
    return county_info

def to_float_or_default_field_value(field_value):
    """
    Convert a field to float, mapping the literal marker "Suppressed" to 0.0.
    :return: 0.0 for "Suppressed", otherwise float(field_value)
    """
    return 0.0 if field_value == "Suppressed" else float(field_value)

def tof(value):
    """
    Shorthand for float conversion.
    :return: float(value)
    """
    as_float = float(value)
    return as_float

def dd(n, d):
    """
    Division with a default: a falsy denominator (0, 0.0, None, ...) yields 0.0.
    The operands are divided as given, without any type conversion.
    :return: n / d, else 0.0
    """
    if not d:
        return 0.0
    return n / d


def ddf(n, d):
    """
    Default division, converting both arguments to float.

    The zero check is applied both to the raw denominator (so falsy values such
    as 0, 0.0, None or "" short-circuit without conversion) and to the converted
    one, so that truthy spellings of zero such as "0" or "0.0" also yield the
    default instead of raising ZeroDivisionError.

    :param n: numerator; anything accepted by float()
    :param d: denominator; anything accepted by float(), or a falsy value
    :return: float(n) / float(d), else 0.0 when the denominator is falsy or zero
    """
    if not d:
        return 0.0
    denominator = float(d)
    if denominator == 0.0:
        return 0.0
    return float(n) / denominator




def save_lines(filename_with_path, list_of_lists):
    """
    Write already newline-terminated text lines to a UTF-8 file, replacing any existing content.
    :param filename_with_path: destination file path
    :param list_of_lists: despite the name, a sequence of strings; they are written back to back
    """
    text = "".join(list_of_lists)
    with codecs.open(filename_with_path, "w", encoding="utf-8") as out_file:
        out_file.write(text)


def get_int_rowval(row, index_into_row):
    """
    Read one field of a row as an int, tolerating missing or malformed values.

    Only conversion and lookup failures are treated as "no value"; anything else
    (notably KeyboardInterrupt / SystemExit, which a bare ``except`` would also
    swallow) propagates to the caller.

    :param row: indexable row of raw field values
    :param index_into_row: position of the field within the row
    :return: the field as an int, or None if it is absent or not an integer
    """
    try:
        return int(row[index_into_row])
    except (ValueError, TypeError, LookupError, OverflowError):
        return None

def get_float_rowval(row, index_into_row):
    """
    Read one field of a row as a float, tolerating missing or malformed values.

    Only conversion and lookup failures are treated as "no value"; anything else
    (notably KeyboardInterrupt / SystemExit, which a bare ``except`` would also
    swallow) propagates to the caller.

    :param row: indexable row of raw field values
    :param index_into_row: position of the field within the row
    :return: the field as a float, or None if it is absent or not a number
    """
    try:
        return float(row[index_into_row])
    except (ValueError, TypeError, LookupError, OverflowError):
        return None



def save_spreadsheet(counties, county_dict, output_dir, label):
    """
    Write one CSV row of proportions per geographic unit.

    Columns: X and notX, T and notT (each as a share of N), W1 (totalW1 as a
    share of totalX), W2 (totalW2 as a share of totalNotX) and the raw count N.
    Every value is formatted with "%f".

    :param counties: iterable of unit ids, in the order they should be written
    :param county_dict: mapping from unit id to its tally record (see new_county_info)
    :param output_dir: directory that receives the file
    :param label: file name of the spreadsheet inside output_dir
    """
    column_names = ["X", "notX", "T", "notT", "W1", "W2", "N"]
    rows = [",".join(column_names) + "\n"]

    for county in counties:
        tallies = county_dict[county]
        n_total = tallies["N"]
        n_x = tallies["totalX"]
        n_not_x = tallies["totalNotX"]

        # sanity check: every counted person is either X or notX
        assert n_x + n_not_x == n_total

        values = (
            ddf(n_x, n_total),
            ddf(n_not_x, n_total),
            ddf(tallies["totalT"], n_total),
            ddf(tallies["totalNotT"], n_total),
            ddf(tallies["totalW1"], n_x),
            ddf(tallies["totalW2"], n_not_x),
            n_total,
        )
        rows.append(",".join("%f" % value for value in values) + "\n")

    save_lines(path.join(output_dir, label), rows)


def _save_and_report_year(counties, county_dict, output_dir, label, year_label, ctr,
                          hhwt_dict, perwt_dict, households, persons):
    """Write the spreadsheet for one finished year and print its summary statistics."""
    save_spreadsheet(counties, county_dict, output_dir, label)
    print("Finished processing year %s" % year_label)
    print("Final ctr %s" % ctr)
    print("Number of counties %s" % len(counties))

    print("hhwt_dict: %s" % (hhwt_dict,))
    print("perwt_dict: %s" % (perwt_dict,))

    print("Number of unique households: %s" % len(households))
    print("Number of unique people: %s" % len(persons))


def process_and_save_data(filepath_with_name, output_dir, year_labels, GEN_LABEL, XVARVAL_1, XVARVAL_SET, TVARVAL_1, XVARVAL_LABEL, TVARVAL_LABEL, TVARVAL_SET, use_mcd=False):
    """
    Stream a comma-separated census extract and write one spreadsheet per requested year.

    The first line is the header; it must contain the columns YEAR, STATEICP,
    COUNTY, HHWT, PERWT, DATANUM, SERIAL, PERNUM, the two columns named by
    XVARVAL_LABEL / TVARVAL_LABEL, and MCD when use_mcd is set. Years are
    consumed in the order given by year_labels: rows are skipped until the
    current target year is seen, tallied while it lasts, and the spreadsheet is
    written as soon as a row with a different year follows. The last year is
    also written when the input ends, so a trailing sentinel line is not needed
    (one that is present is harmless and does not cause a second write).

    Only rows whose X value is in XVARVAL_SET and whose T value is in
    TVARVAL_SET are counted.

    NOTE: a counted row whose STATEICP / COUNTY / MCD / DATANUM / SERIAL /
    PERNUM field is not an integer raises TypeError when its identifier is
    formatted with "%d".

    :param filepath_with_name: path of the UTF-8 input CSV
    :param output_dir: directory that receives the spreadsheets
    :param year_labels: years to process, as strings, in the order they occur in the file
    :param GEN_LABEL: output file name template containing "{year_label}"
    :param XVARVAL_1: X value counted as "X" (any other value in XVARVAL_SET counts as "notX")
    :param XVARVAL_SET: X values that make a row eligible
    :param TVARVAL_1: T value counted as "T" (any other value in TVARVAL_SET counts as "notT")
    :param XVARVAL_LABEL: header name of the X column
    :param TVARVAL_LABEL: header name of the T column
    :param TVARVAL_SET: T values that make a row eligible
    :param use_mcd: if True, the geographic unit is state-county-MCD instead of state-county
    """
    current_dataset_index = 0
    year_label = year_labels[current_dataset_index]
    target_year = int(year_label)  # parsed once per year rather than once per input line
    label = GEN_LABEL.format(year_label=year_label)

    ctr = 0

    seen_year = False
    # 'county' here refers to the geographic unit
    counties = defaultdict(int)
    county_dict = defaultdict(int)  # index: county id; value: new_county_info()

    hhwt_dict = defaultdict(int)
    perwt_dict = defaultdict(int)

    households = defaultdict(int)
    persons = defaultdict(int)

    with codecs.open(filepath_with_name, encoding="utf-8") as f:
        for line in f:
            if ctr % 5000000 == 0:
                print("Line %d" % ctr)
            row = line.strip().split(",")
            if ctr == 0:  # header
                header = [x.strip('"') for x in row]
                print(header)
                year_index = header.index("YEAR")

                state_index = header.index("STATEICP")
                county_index = header.index("COUNTY")
                if use_mcd:
                    mcd_index = header.index("MCD")

                xvar_index = header.index(XVARVAL_LABEL)
                tvar_index = header.index(TVARVAL_LABEL)

                hhwt_index = header.index("HHWT")
                perwt_index = header.index("PERWT")

                # check that each person is included only once
                # A combination of YEAR, DATANUM, and SERIAL provides a unique identifier for
                # every household in the IPUMS; the combination of YEAR, DATANUM, SERIAL, and
                # PERNUM uniquely identifies every person in the database.
                datanum_index = header.index("DATANUM")
                serial_index = header.index("SERIAL")  # household
                pernum_index = header.index("PERNUM")  # person

            else:  # body
                year = get_int_rowval(row, year_index)
                county_id = get_int_rowval(row, county_index)
                state = get_int_rowval(row, state_index)

                datanum = get_int_rowval(row, datanum_index)
                serial = get_int_rowval(row, serial_index)
                pernum = get_int_rowval(row, pernum_index)

                xvar = get_int_rowval(row, xvar_index)
                tvar = get_int_rowval(row, tvar_index)

                hhwt = get_float_rowval(row, hhwt_index)
                perwt = get_float_rowval(row, perwt_index)

                if year != target_year and seen_year:
                    # the block of rows for the current year has ended
                    _save_and_report_year(counties, county_dict, output_dir, label, year_label, ctr,
                                          hhwt_dict, perwt_dict, households, persons)

                    hhwt_dict = defaultdict(int)
                    perwt_dict = defaultdict(int)

                    counties = defaultdict(int)
                    county_dict = defaultdict(int)

                    households = defaultdict(int)
                    persons = defaultdict(int)

                    # reset before the possible break so the end-of-input flush below
                    # cannot write this year a second time
                    seen_year = False

                    current_dataset_index += 1
                    if current_dataset_index > len(year_labels) - 1:
                        break
                    year_label = year_labels[current_dataset_index]
                    target_year = int(year_label)
                    label = GEN_LABEL.format(year_label=year_label)

                if year == target_year:
                    seen_year = True

                    if xvar in XVARVAL_SET and tvar in TVARVAL_SET:

                        hhwt_dict[hhwt] += 1
                        perwt_dict[perwt] += 1

                        if use_mcd:
                            mcd = get_int_rowval(row, mcd_index)
                            county = "%d-%d-%d" % (state, county_id, mcd)
                        else:
                            # a county is uniquely identified by the county id and the state id
                            county = "%d-%d" % (state, county_id)
                        counties[county] += 1

                        household = "%d-%d-%d" % (year, datanum, serial)
                        households[household] += 1

                        person = "%d-%d-%d-%d" % (year, datanum, serial, pernum)
                        persons[person] += 1
                        if persons[person] > 1:
                            print("WARNING: person %s occurs with frequency %d in line: %s" % (person, persons[person], row))

                        # add data:
                        if county not in county_dict:
                            county_info = new_county_info()
                        else:
                            county_info = county_dict[county]

                        if xvar == XVARVAL_1:
                            county_info["totalX"] += 1.0
                        else:
                            county_info["totalNotX"] += 1.0

                        if tvar == TVARVAL_1:
                            county_info["totalT"] += 1.0
                        else:
                            county_info["totalNotT"] += 1.0

                        if xvar == XVARVAL_1 and tvar == TVARVAL_1:
                            county_info["totalW1"] += 1.0
                        if xvar != XVARVAL_1 and tvar == TVARVAL_1:
                            county_info["totalW2"] += 1.0

                        county_info["N"] += 1.0

                        county_dict[county] = county_info

            ctr += 1

    # The input ended while still inside the current year's rows: write that year too,
    # otherwise the last year present in the file would be silently dropped.
    if seen_year:
        _save_and_report_year(counties, county_dict, output_dir, label, year_label, ctr,
                              hhwt_dict, perwt_dict, households, persons)




def _run_config(year_labels, gen_label, x_label, x_val_1, x_val_set,
                t_label, t_val_1, t_val_set, use_mcd=False):
    """Bundle the settings of one numbered run, keyed by process_and_save_data's parameter names."""
    return {
        "year_labels": year_labels.split(","),
        "GEN_LABEL": gen_label,
        "XVARVAL_1": x_val_1,
        "XVARVAL_SET": x_val_set,
        "TVARVAL_1": t_val_1,
        "XVARVAL_LABEL": x_label,
        "TVARVAL_LABEL": t_label,
        "TVARVAL_SET": t_val_set,
        "use_mcd": use_mcd,
    }


# Settings for every supported run number. Per entry: years, output file name template,
# then (X column, X value counted as "X", eligible X values) and
# (T column, T value counted as "T", eligible T values).
RUN_CONFIGS = {
    1: _run_config(
        "1850,1900,1910,1920,1930",
        "ipums_full_census_year{year_label}_menwomen_literate_notliterate__x-is-men_t-is-LIT.csv",
        "SEX", 1, [1, 2],  # 1 = Male, 2 = Female
        "LIT", 4, [1, 2, 3, 4],  # 4 = Yes, literate (reads and writes)
    ),
    2: _run_config(
        "1850,1900,1910,1920,1930",
        "ipums_full_census_year{year_label}_blackwhite_literate_notliterate__x-is-black_t-is-LIT.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "LIT", 4, [1, 2, 3, 4],  # 4 = Yes, literate (reads and writes)
    ),
    3: _run_config(
        "1850,1880,1910,1920,1930,1940",
        "ipums_full_census_year{year_label}_blackwhite_notinlaborforce_inlaborforce__x-is-black_t-is-LABFORCE.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "LABFORCE", 1, [1, 2],  # 1 = No, not in the labor force; 2 = Yes, in the labor force
    ),
    4: _run_config(
        "1850,1880,1910,1920,1930,1940",
        "ipums_full_census_year{year_label}_menwomen_notinlaborforce_inlaborforce__x-is-men_t-is-LABFORCE.csv",
        "SEX", 1, [1, 2],  # 1 = Male, 2 = Female
        "LABFORCE", 1, [1, 2],  # 1 = No, not in the labor force; 2 = Yes, in the labor force
    ),
    5: _run_config(
        "1900,1910,1920,1930",
        "ipums_full_census_year{year_label}_foreignborn_nativeborn_literate_notliterate_x-is-foreignborn_t-is-LIT.csv",
        "NATIVITY", 5, [1, 2, 3, 4, 5],  # 5 = Foreign born
        "LIT", 4, [1, 2, 3, 4],  # 4 = Yes, literate (reads and writes)
    ),
    6: _run_config(
        # full list is "1900,1910,1920,1930"; shortcircuited here to finish a killed run
        "1930",
        "ipums_full_census_year{year_label}_foreignborn_nativeborn_speaksenglish_doesnotspeakenglish_x-is-foreignborn_t-is-SPEAKENG.csv",
        "NATIVITY", 5, [1, 2, 3, 4, 5],  # 5 = Foreign born
        # for these years (1900-1930), only 1,2 are an option
        "SPEAKENG", 2, [2, 1],  # 2 = Yes, speaks English...; 1 = Does not speak English
    ),
    7: _run_config(
        "1880,1900,1910,1920,1930,1940",
        "ipums_full_census_year{year_label}_blackwhite_foreignborn_nativeborn__x-is-black_t-is-NATIVITY.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "NATIVITY", 5, [1, 2, 3, 4, 5],  # 5 = Foreign born
    ),
    8: _run_config(
        "1880,1900,1910,1920,1930,1940",
        "ipums_full_census_year{year_label}_menwomen_foreignborn_nativeborn__x-is-men_t-is-NATIVITY.csv",
        "SEX", 1, [1, 2],  # 1 = Male, 2 = Female
        "NATIVITY", 5, [1, 2, 3, 4, 5],  # 5 = Foreign born
    ),
    9: _run_config(
        # full list is "1910,1920,1930,1940"; shortcircuited here to finish a killed run
        "1920,1930,1940",
        "ipums_full_census_year{year_label}_menwomen_selfemployed_worksforwages__x-is-men_t-is-CLASSWKR.csv",
        "SEX", 1, [1, 2],  # 1 = Male, 2 = Female
        "CLASSWKR", 1, [1, 2],  # 1 = Self-employed, 2 = Works for wages
    ),
    10: _run_config(
        "1910,1920,1930,1940",
        "ipums_full_census_year{year_label}_blackwhite_selfemployed_worksforwages__x-is-black_t-is-CLASSWKR.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "CLASSWKR", 1, [1, 2],  # 1 = Self-employed, 2 = Works for wages
    ),
    11: _run_config(
        "1940",
        "ipums_full_census_year{year_label}_blackwhite_vet_notvet__x-is-black_t-is-vetstat.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "VETSTAT", 2, [2, 1],  # 2 = VET, 1 = NOT_VET
    ),
    12: _run_config(
        "1930,1940",
        "ipums_full_census_year{year_label}_blackwhite_unemployed_employed__x-is-black_t-is-EMPSTAT.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "EMPSTAT", 2, [2, 1],  # 2 = Unemployed, 1 = Employed
    ),
    13: _run_config(
        "1930,1940",
        "ipums_full_census_year{year_label}_menwomen_unemployed_employed__x-is-men_t-is-EMPSTAT.csv",
        "SEX", 1, [1, 2],  # 1 = Male, 2 = Female
        "EMPSTAT", 2, [2, 1],  # 2 = Unemployed, 1 = Employed
    ),
    14: _run_config(
        "1880",
        "ipums_full_census_by_mcd_year{year_label}_blackwhite_notinlaborforce_inlaborforce__x-is-black_t-is-LABFORCE.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "LABFORCE", 1, [1, 2],  # 1 = No, not in the labor force; 2 = Yes, in the labor force
        use_mcd=True
    ),
    15: _run_config(
        "1880",
        "ipums_full_census_by_mcd_year{year_label}_menwomen_notinlaborforce_inlaborforce__x-is-men_t-is-LABFORCE.csv",
        "SEX", 1, [1, 2],  # 1 = Male, 2 = Female
        "LABFORCE", 1, [1, 2],  # 1 = No, not in the labor force; 2 = Yes, in the labor force
        use_mcd=True
    ),
    16: _run_config(
        "1880",
        "ipums_full_census_by_mcd_year{year_label}_blackwhite_foreignborn_nativeborn__x-is-black_t-is-NATIVITY.csv",
        "RACE", 2, [2, 1],  # 2 = BLACK, 1 = WHITE
        "NATIVITY", 5, [1, 2, 3, 4, 5],  # 5 = Foreign born
        use_mcd=True
    ),
    17: _run_config(
        "1880",
        "ipums_full_census_by_mcd_year{year_label}_menwomen_foreignborn_nativeborn__x-is-men_t-is-NATIVITY.csv",
        "SEX", 1, [1, 2],  # 1 = Male, 2 = Female
        "NATIVITY", 5, [1, 2, 3, 4, 5],  # 5 = Foreign born
        use_mcd=True
    ),
}


def main(arguments):
    """
    Command-line entry point: run each requested numbered configuration over the input CSV.

    :param arguments: argument list without the program name; expects --run_set
        (comma-separated run numbers, see RUN_CONFIGS), --input_csv_data_file and
        --output_dir, all of which are required
    :return: None
    :raises SystemExit: from argparse when an argument is missing or unknown
    :raises ValueError: when --run_set contains an entry that is not an integer
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # All three are required: without them the script previously failed later with an
    # unrelated error (e.g. AttributeError on None.strip()).
    parser.add_argument('--run_set', required=True, help="run_set")
    parser.add_argument('--input_csv_data_file', required=True, help="input_csv_data_file")
    parser.add_argument('--output_dir', required=True, help="output_dir")

    args = parser.parse_args(arguments)
    input_csv_data_file = args.input_csv_data_file
    output_dir = args.output_dir

    run_set = [int(x) for x in args.run_set.strip().split(",")]
    print("Note that in the current version, the final year will be cutoff without adding a final line to the .csv (see README)")
    print("Considering the following run set: %s" % (run_set,))

    for run_number in run_set:
        print("Currently processing run %s" % run_number)
        config = RUN_CONFIGS.get(run_number)
        if config is None:
            # previously an unknown run number was skipped without any message
            print("WARNING: unknown run number %s (valid: %s); skipping" % (run_number, sorted(RUN_CONFIGS)))
            continue
        process_and_save_data(input_csv_data_file, output_dir,
                              config["year_labels"], config["GEN_LABEL"],
                              config["XVARVAL_1"], config["XVARVAL_SET"], config["TVARVAL_1"],
                              config["XVARVAL_LABEL"], config["TVARVAL_LABEL"], config["TVARVAL_SET"],
                              config["use_mcd"])
# Script entry point: pass the command-line arguments (without the program name)
# to main() and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))



